In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
In [4]:
bank_df = pd.read_csv("../../data/bank_small.csv")
bank_df
Out[4]:
age job marital education default balance housing loan contact day month duration campaign pdays previous poutcome y
0 30 unemployed married primary no 1787 no no cellular 19 oct 79 1 -1 0 unknown no
1 33 services married secondary no 4789 yes yes cellular 11 may 220 1 339 4 failure no
2 35 management single tertiary no 1350 yes no cellular 16 apr 185 1 330 1 failure no
3 30 management married tertiary no 1476 yes yes unknown 3 jun 199 4 -1 0 unknown no
4 59 blue-collar married secondary no 0 yes no unknown 5 may 226 1 -1 0 unknown no
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
4516 33 services married secondary no -333 yes no cellular 30 jul 329 5 -1 0 unknown no
4517 57 self-employed married tertiary yes -3313 yes yes unknown 9 may 153 1 -1 0 unknown no
4518 57 technician married secondary no 295 no no cellular 19 aug 151 11 -1 0 unknown no
4519 28 blue-collar married secondary no 1137 no no cellular 6 feb 129 4 211 3 other no
4520 44 entrepreneur single tertiary no 1136 yes yes cellular 3 apr 345 2 249 7 other no

4521 rows × 17 columns

In [5]:
# Citation Request:
# This dataset is public available for research. The details are described 
# in [Moro et al., 2011]. Please include this citation if you plan to use this database:
#   
# [Moro et al., 2011] S. Moro, R. Laureano and P. Cortez. Using Data Mining for Bank 
# Direct Marketing: An Application of the CRISP-DM Methodology. 
# In P. Novais et al. (Eds.), Proceedings of the European Simulation and Modelling 
# Conference - ESM'2011, pp. 117-121, Guimarães, Portugal, October, 2011. EUROSIS.
# 
# Available at: [pdf] http://hdl.handle.net/1822/14838
# [bib] http://www3.dsi.uminho.pt/pcortez/bib/2011-esm-1.txt
# 
# 1. Title: Bank Marketing
# 
# 2. Sources
# Created by: Paulo Cortez (Univ. Minho) and Sérgio Moro (ISCTE-IUL) @ 2012
# 
# 3. Past Usage:
# 
# The full dataset was described and analyzed in:
# 
# S. Moro, R. Laureano and P. Cortez. Using Data Mining for Bank Direct Marketing: An Application of the CRISP-DM Methodology. 
# In P. Novais et al. (Eds.), Proceedings of the European Simulation and Modelling Conference - ESM'2011, pp. 117-121, Guimarães, 
# Portugal, October, 2011. EUROSIS.
# 
# 4. Relevant Information:
#   
# The data is related with direct marketing campaigns of a Portuguese banking institution. 
# The marketing campaigns were based on phone calls. Often, more than one contact to the 
# same client was required, in order to access if the product (bank term deposit) 
# would be (or not) subscribed. 
# 
# There are two datasets: 
# 1) bank-full.csv with all examples, ordered by date (from May 2008 to November 2010).
# 2) bank.csv with 10% of the examples (4521), randomly selected from bank-full.csv.
# The smallest dataset is provided to test more computationally demanding machine 
# learning algorithms (e.g. SVM).
# 
# The classification goal is to predict if the client will subscribe a term deposit (variable y).
# 
# 5. Number of Instances: 45211 for bank-full.csv (4521 for bank.csv)
# 
# 6. Number of Attributes: 16 + output attribute (y).
# 
# 7. Attribute information:
#   
#   For more information, read [Moro et al., 2011].
# 
# Input variables:
# # bank client data:
# 1 - age (numeric)
# 2 - job : type of job (categorical: "admin.","unknown","unemployed","management","housemaid",
# "entrepreneur","student","blue-collar","self-employed","retired","technician","services") 
# 3 - marital : marital status (categorical: "married","divorced","single"; note: "divorced" means divorced or widowed)
# 4 - education (categorical: "unknown","secondary","primary","tertiary")
# 5 - default: has credit in default? (binary: "yes","no")
# 6 - balance: average yearly balance, in euros (numeric) 
# 7 - housing: has housing loan? (binary: "yes","no")
# 8 - loan: has personal loan? (binary: "yes","no")
#
# # related with the last contact of the current campaign:
# 9 - contact: contact communication type (categorical: "unknown","telephone","cellular") 
# 10 - day: last contact day of the month (numeric)
# 11 - month: last contact month of year (categorical: "jan", "feb", "mar", ..., "nov", "dec")
# 12 - duration: last contact duration, in seconds (numeric)
#
# # other attributes:
# 13 - campaign: number of contacts performed during this campaign and for this client
# (numeric, includes last contact)
# 14 - pdays: number of days that passed by after the client was last contacted from 
# a previous campaign (numeric, -1 means client was not previously contacted)
# 15 - previous: number of contacts performed before this campaign and for this client (numeric)
# 16 - poutcome: outcome of the previous marketing campaign
# (categorical: "unknown","other","failure","success")
# 
# Output variable (desired target):
# 17 - y - has the client subscribed a term deposit? (binary: "yes","no")
In [6]:
bank_df.describe()
Out[6]:
age balance day duration campaign pdays previous
count 4521.000000 4521.000000 4521.000000 4521.000000 4521.000000 4521.000000 4521.000000
mean 41.170095 1422.657819 15.915284 263.961292 2.793630 39.766645 0.542579
std 10.576211 3009.638142 8.247667 259.856633 3.109807 100.121124 1.693562
min 19.000000 -3313.000000 1.000000 4.000000 1.000000 -1.000000 0.000000
25% 33.000000 69.000000 9.000000 104.000000 1.000000 -1.000000 0.000000
50% 39.000000 444.000000 16.000000 185.000000 2.000000 -1.000000 0.000000
75% 49.000000 1480.000000 21.000000 329.000000 3.000000 -1.000000 0.000000
max 87.000000 71188.000000 31.000000 3025.000000 50.000000 871.000000 25.000000
In [7]:
seed = 123 # for reproducibility

X = bank_df.drop('y', axis=1)  # Features (all columns except 'y')
y = bank_df['y']  # Target variable

# We need to encode the categorical variables
X = pd.get_dummies(data = X, drop_first = True)

#split our dataset into training and test set using the outcome variable as stratified
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state = seed)
In [8]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
In [9]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

# Define the SVM classifier and parameter grid for tuning
svm = SVC(kernel='rbf', random_state = seed)  # Radial basis function kernel
param_grid = {'C': [0.25, 0.50, 1, 2, 4, 8, 16, 32, 64, 128]}

# Perform grid search with cross-validation
grid_search = GridSearchCV(estimator=svm, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)

results_df = pd.DataFrame(grid_search.cv_results_)[["param_C", "mean_test_score"]]
results_df
Out[9]:
param_C mean_test_score
0 0.25 0.885510
1 0.5 0.886891
2 1 0.888829
3 2 0.892151
4 4 0.893527
5 8 0.892977
6 16 0.889103
7 32 0.882746
8 64 0.881916
9 128 0.875831
In [10]:
ax = results_df.plot(x = "param_C", y = "mean_test_score", marker = '.')
ax.set_xlabel("Cost")
ax.set_ylabel("Accuracy (cross-validation)")
Out[10]:
Text(0, 0.5, 'Accuracy (cross-validation)')
In [11]:
# sigma = gamma in SVC
# *This cell takes some time to run*

svm = SVC(kernel='rbf', random_state = seed)  # Radial basis function kernel
param_grid = {'C': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], 'gamma': [0.016, 0.017, 0.018, 0.019, 0.020]}

# Perform grid search with cross-validation
grid_search = GridSearchCV(estimator=svm, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)

results_df = pd.DataFrame(grid_search.cv_results_)[["param_C", "param_gamma", "mean_test_score"]]
results_df
Out[11]:
param_C param_gamma mean_test_score
0 1 0.016 0.889658
1 1 0.017 0.889659
2 1 0.018 0.889106
3 1 0.019 0.889106
4 1 0.02 0.888276
5 2 0.016 0.888281
6 2 0.017 0.888004
7 2 0.018 0.888559
8 2 0.019 0.888558
9 2 0.02 0.888281
10 3 0.016 0.892431
11 3 0.017 0.891877
12 3 0.018 0.892151
13 3 0.019 0.892981
14 3 0.02 0.891598
15 4 0.016 0.892427
16 4 0.017 0.892981
17 4 0.018 0.894086
18 4 0.019 0.893809
19 4 0.02 0.893809
20 5 0.016 0.893810
21 5 0.017 0.894363
22 5 0.018 0.894360
23 5 0.019 0.892976
24 5 0.02 0.891870
25 6 0.016 0.895742
26 6 0.017 0.894911
27 6 0.018 0.894358
28 6 0.019 0.894081
29 6 0.02 0.892975
30 7 0.016 0.896016
31 7 0.017 0.894357
32 7 0.018 0.893804
33 7 0.019 0.892421
34 7 0.02 0.891867
35 8 0.016 0.894080
36 8 0.017 0.892974
37 8 0.018 0.892697
38 8 0.019 0.891868
39 8 0.02 0.891593
40 9 0.016 0.893527
41 9 0.017 0.892421
42 9 0.018 0.891592
43 9 0.019 0.891870
44 9 0.02 0.892424
45 10 0.016 0.893252
46 10 0.017 0.892421
47 10 0.018 0.891869
48 10 0.019 0.892977
49 10 0.02 0.892147
In [12]:
import plotly.express as px

fig = px.line(results_df, x = "param_C", y = "mean_test_score", color = "param_gamma", symbol = "param_gamma")
fig
In [10]:
# best model
grid_search.best_estimator_
Out[10]:
SVC(C=7, gamma=0.016, random_state=123)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
SVC(C=7, gamma=0.016, random_state=123)
In [11]:
from sklearn.metrics import confusion_matrix

y_pred = grid_search.best_estimator_.predict(X_train_scaled) # Get predictions from the best model
conf_matrix = confusion_matrix(y_train, y_pred)
conf_matrix
Out[11]:
array([[3187,   12],
       [ 130,  287]], dtype=int64)
In [12]:
grid_search.classes_
Out[12]:
array(['no', 'yes'], dtype=object)
In [13]:
# Making the visualization of the confusion matrix better
from sklearn.metrics import ConfusionMatrixDisplay

disp = ConfusionMatrixDisplay(conf_matrix, display_labels=["No", "Yes"])
disp.plot()
Out[13]:
<sklearn.metrics._plot.confusion_matrix.ConfusionMatrixDisplay at 0x243faf8a820>
In [19]:
#Model validation on the test set
X_test_scaled = scaler.fit_transform(X_test)

y_pred = grid_search.best_estimator_.predict(X_test_scaled) # Get predictions from the best model
conf_matrix = confusion_matrix(y_test, y_pred)
conf_matrix
Out[19]:
array([[777,  24],
       [ 69,  35]], dtype=int64)
In [20]:
disp = ConfusionMatrixDisplay(conf_matrix, display_labels=["No", "Yes"])
disp.plot()
Out[20]:
<sklearn.metrics._plot.confusion_matrix.ConfusionMatrixDisplay at 0x1f2e14d7070>
In [21]:
import numpy as np
In [27]:
# *This cell takes time to run*
param_grid = {
    'gamma': [10 ** i for i in range(-5, 0)],
    'C': [10 ** i for i in range(-3, 2)]
}
svm = SVC(kernel='rbf', random_state = seed)
grid_search = GridSearchCV(estimator=svm, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
best_model = grid_search.best_estimator_
best_params = grid_search.best_params_
In [29]:
best_model
Out[29]:
SVC(C=10, gamma=0.01, random_state=123)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
SVC(C=10, gamma=0.01, random_state=123)
In [28]:
# All results
results_df = pd.DataFrame(grid_search.cv_results_)[["param_C", "param_gamma", "mean_test_score"]]
results_df
Out[28]:
param_C param_gamma mean_test_score
0 0.001 0.00001 0.884680
1 0.001 0.0001 0.884680
2 0.001 0.001 0.884680
3 0.001 0.01 0.884680
4 0.001 0.1 0.884680
5 0.01 0.00001 0.884680
6 0.01 0.0001 0.884680
7 0.01 0.001 0.884680
8 0.01 0.01 0.884680
9 0.01 0.1 0.884680
10 0.1 0.00001 0.884680
11 0.1 0.0001 0.884680
12 0.1 0.001 0.884680
13 0.1 0.01 0.884680
14 0.1 0.1 0.884680
15 1 0.00001 0.884680
16 1 0.0001 0.884680
17 1 0.001 0.891867
18 1 0.01 0.890486
19 1 0.1 0.886065
20 10 0.00001 0.884680
21 10 0.0001 0.891867
22 10 0.001 0.891590
23 10 0.01 0.897955
24 10 0.1 0.879977
In [33]:
#it looks like \gamma=0.01-0.1 and C=1-10 does better job
# *This cell takes time to run*
param_grid = {
    'gamma': np.arange(0.01, 0.13, 0.01),
    'C': np.arange(1,13,1)
}
svm = SVC(kernel='rbf', random_state = seed)
grid_search = GridSearchCV(estimator=svm, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)

best_model = grid_search.best_estimator_
best_params = grid_search.best_params_
best_model
Out[33]:
SVC(C=10, gamma=0.01, random_state=123)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
SVC(C=10, gamma=0.01, random_state=123)
In [34]:
# All results
results_df = pd.DataFrame(grid_search.cv_results_)[["param_C", "param_gamma", "mean_test_score"]]
results_df
Out[34]:
param_C param_gamma mean_test_score
0 1 0.01 0.890486
1 1 0.02 0.888276
2 1 0.03 0.889107
3 1 0.04 0.889107
4 1 0.05 0.889383
... ... ... ...
139 12 0.08 0.878872
140 12 0.09 0.880530
141 12 0.1 0.879148
142 12 0.11 0.878872
143 12 0.12 0.879703

144 rows × 3 columns

In [45]:
# best accuracy
results_df[results_df["mean_test_score"] == results_df["mean_test_score"].max()]
Out[45]:
param_C param_gamma mean_test_score
108 10 0.01 0.897955
In [55]:
#Model validation on the test set
y_pred = grid_search.best_estimator_.predict(X_test_scaled) # Get predictions from the best model
conf_matrix = confusion_matrix(y_test, y_pred)
conf_matrix
Out[55]:
array([[782,  19],
       [ 67,  37]], dtype=int64)
In [56]:
# Identify the order of the classes.
grid_search.classes_
Out[56]:
array(['no', 'yes'], dtype=object)
In [57]:
disp = ConfusionMatrixDisplay(conf_matrix, display_labels=["No", "Yes"])
disp.plot()
Out[57]:
<sklearn.metrics._plot.confusion_matrix.ConfusionMatrixDisplay at 0x1f2e16bbca0>
In [58]:
# Accuracy. It seems that it performs even better on the test set.
accuracy = np.mean(y_pred == y_test)
accuracy
Out[58]:
0.9049723756906077
In [67]:
# 95% Confidence Interval
from statsmodels.stats.proportion import proportion_confint

confidence_interval = proportion_confint(count=np.sum(y_pred == y_test), nobs=len(y_pred))
confidence_interval
Out[67]:
(0.88586652415365, 0.9240782272275655)